In [1]:
import sklearn
In [2]:
#pip install scikit-learn==1.4.2
In [3]:
# Confirm the scikit-learn version in use (outputs below were produced with 1.4.2).
print(sklearn.__version__)
1.4.2

Powinno być 1.2.2 (inaczej może nie działać :( )

1. Packages

In [4]:
import pandas as pd
import numpy as np
import sklearn 
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import plotly.express as px
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.neighbors import KNeighborsClassifier



from sklearn.model_selection import RandomizedSearchCV
# Suppress all warnings globally — keeps the report clean, but can hide real
# issues (e.g. saga convergence warnings or deprecation notices).
warnings.filterwarnings('ignore')

2. Wczytywanie danych

In [31]:
# Load the bean dataset; 'Class' is the categorical target.
data = pd.read_csv('../Data/our_data.csv')
X = data.drop('Class', axis=1)
y = data['Class']

# 70% train / 30% temp, then the temp part is split again 70/30 into
# validation and test; both splits are stratified on the class label.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, stratify=y, test_size=0.3, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_val, y_val, stratify=y_val, test_size=0.3, random_state=42
)

# Drop three features (presumably redundant with the remaining shape
# features — TODO confirm against the earlier EDA).
X_train = X_train.drop(['Compactness','EquivDiameter', 'Area'], axis=1)
X_val = X_val.drop(['Compactness','EquivDiameter','Area'], axis=1)
X_test = X_test.drop(['Compactness','EquivDiameter','Area'], axis=1)
cols = X_train.columns  # remaining feature names, reused after scaling

3. Scaling

In [32]:
# Box-Cox power transform (requires strictly positive inputs), fitted on the
# training split only; val/test reuse the training-set parameters.
scaling = sklearn.preprocessing.PowerTransformer(method='box-cox')
X_train = scaling.fit_transform(X_train)
X_test = scaling.transform(X_test)
X_val = scaling.transform(X_val)

# Transformers return plain ndarrays; wrap back into DataFrames with names.
X_train = pd.DataFrame(X_train, columns=cols)
X_test = pd.DataFrame(X_test, columns=cols)
X_val = pd.DataFrame(X_val, columns=cols)

4. Encoding

In [33]:
# One-hot encoding of the target, fitted on the training labels only.
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(y_train.to_frame())
y_encoded = pd.DataFrame(enc.transform(y_train.to_frame()).toarray(),columns=enc.get_feature_names_out(['Class']))
y_val_encoded = pd.DataFrame(enc.transform(y_val.to_frame()).toarray(),columns=enc.get_feature_names_out(['Class']))
y_test_encoded = pd.DataFrame(enc.transform(y_test.to_frame()).toarray(),columns=enc.get_feature_names_out(['Class']))

# Ordinal encoding 0,1,2,... of the target.
# BUG FIX: fit the encoder ONCE on the training labels, then only transform
# val/test. The original re-fitted on each split, which could assign
# different integer codes whenever a split is missing a class.
labelencoder = sklearn.preprocessing.LabelEncoder()
y_encoded2 = pd.DataFrame(labelencoder.fit_transform(y_train))
y_val_encoded2 = pd.DataFrame(labelencoder.transform(y_val))
y_test_encoded2 = pd.DataFrame(labelencoder.transform(y_test))
In [34]:
# Inspect the remaining feature columns after the drops above.
X_train.columns
Out[34]:
Index(['Perimeter', 'MajorAxisLength', 'MinorAxisLength', 'AspectRation',
       'Eccentricity', 'ConvexArea', 'Extent', 'Solidity', 'roundness',
       'ShapeFactor1', 'ShapeFactor2', 'ShapeFactor3', 'ShapeFactor4'],
      dtype='object')
In [35]:
# Canonical class order used for confusion matrices and reports.
class_names = ['BARBUNYA', 'BOMBAY', 'CALI', 'DERMASON', 'HOROZ', 'SEKER', 'SIRA']

5. Strojenie parametrów i kroswalidacja

5.1 Strojenie parametrów

In [10]:
def hyperparameters_tuner(estimator, param_distributions, X, y, cv=5, n_iter=10, random_state=42):
    """Run a randomized hyper-parameter search and return the best settings.

    Samples `n_iter` candidate combinations from `param_distributions` and
    scores each with `cv`-fold cross-validation on (X, y); mirrors the
    semantics of sklearn's RandomizedSearchCV directly.
    """
    search = RandomizedSearchCV(
        estimator,
        param_distributions=param_distributions,
        n_iter=n_iter,
        cv=cv,
        random_state=random_state,
    )
    search.fit(X, y)
    return search.best_params_
In [11]:
def plot_confusion_matrix(y_true, y_pred, class_names):
    """Render a labelled confusion matrix as a plotly heatmap.

    `class_names` fixes both the row/column order of the matrix and the axis
    tick labels; cell counts are drawn manually with a contrast-aware colour.
    """
    cm = confusion_matrix(y_true, y_pred, labels=class_names)
    # BUG FIX: 'text_auto' was passed as a key of the `labels` mapping, where
    # plotly silently ignores it (it is an argument of px.imshow itself).
    # Dropped entirely: the loop below already annotates every cell, so
    # enabling text_auto would double-print the counts.
    fig = px.imshow(cm,
                    labels=dict(x="Predicted", y="Actual", color="Count"),
                    x=class_names, y=class_names)
    fig.update_xaxes(side="top")
    for i in range(len(class_names)):
        for j in range(len(class_names)):
            fig.add_annotation(
                x=class_names[j], y=class_names[i], text=str(cm[i, j]),
                showarrow=False,
                # dark text on bright (high-count) cells, light text otherwise
                font=dict(color="black" if cm[i, j] > cm.max()/2 else "white"),
            )

    fig.show()

5.2 Kroswalidacja

In [12]:
def train_evaluate_encoded2(estimator, param_distributions, X_train, y_train, X_val, y_val, cv=5, class_names=['BARBUNYA', 'BOMBAY', 'CALI', 'DERMASON', 'HOROZ', 'SEKER', 'SIRA']):
    """Tune, fit and evaluate a model trained on integer-encoded labels.

    Predictions and validation labels are decoded back to class names via
    the module-level `labelencoder` before the confusion matrix and report
    are printed. Returns (best_params, fitted best model).
    """
    # BUG FIX: forward cv so the tuner honours the caller's fold count
    # (previously the tuner always used its own default of 5).
    best_params = hyperparameters_tuner(estimator, param_distributions, X_train, y_train, cv=cv)
    best_model = estimator.set_params(**best_params)
    best_model.fit(X_train, y_train)

    y_pred = best_model.predict(X_val)

    accuracy = accuracy_score(y_val, y_pred)

    # Cross-validate the tuned model on both splits to spot over/underfitting.
    cv_results = cross_val_score(best_model, X_train, y_train, cv=cv)
    cv_val_results = cross_val_score(best_model, X_val, y_val, cv=cv)

    # Decode integer labels back to bean names for readable reports.
    y_pred = labelencoder.inverse_transform(y_pred)
    y_val = labelencoder.inverse_transform(y_val)

    print(f"Best parameters: {best_params}")
    print('__________________________________________________________')
    print(f"Accuracy: {accuracy}")
    plot_confusion_matrix(y_val, y_pred, class_names)
    # BUG FIX: use the class_names parameter instead of a hard-coded copy.
    print(classification_report(y_val, y_pred, target_names=class_names))
    print("__________________________________________________________")
    print(f"Cross-validation results: {cv_results}")
    print(f"Mean accuracy: {cv_results.mean()}")
    print(f"Cross-validation results on validation set: {cv_val_results}")
    print(f"Mean accuracy on validation set: {cv_val_results.mean()}")
    return best_params, best_model
In [13]:
def train_evaluate(estimator, param_distributions, X_train, y_train, X_val, y_val, cv=5, class_names=['BARBUNYA', 'BOMBAY', 'CALI', 'DERMASON', 'HOROZ', 'SEKER', 'SIRA']):
    """Tune, fit and evaluate a model trained on string class labels.

    Same as train_evaluate_encoded2, but the labels are assumed to already
    be class names, so no decoding step is needed.
    Returns (best_params, fitted best model).
    """
    # BUG FIX: forward cv so the tuner honours the caller's fold count
    # (previously the tuner always used its own default of 5).
    best_params = hyperparameters_tuner(estimator, param_distributions, X_train, y_train, cv=cv)
    best_model = estimator.set_params(**best_params)
    best_model.fit(X_train, y_train)

    y_pred = best_model.predict(X_val)

    accuracy = accuracy_score(y_val, y_pred)

    # Cross-validate the tuned model on both splits to spot over/underfitting.
    cv_results = cross_val_score(best_model, X_train, y_train, cv=cv)
    cv_val_results = cross_val_score(best_model, X_val, y_val, cv=cv)

    print(f"Best parameters: {best_params}")
    print('__________________________________________________________')
    print(f"Accuracy: {accuracy}")
    plot_confusion_matrix(y_val, y_pred, class_names)
    # BUG FIX: use the class_names parameter instead of a hard-coded copy.
    print(classification_report(y_val, y_pred, target_names=class_names))
    print("__________________________________________________________")
    print(f"Cross-validation results: {cv_results}")
    print(f"Mean accuracy: {cv_results.mean()}")
    print(f"Cross-validation results on validation set: {cv_val_results}")
    print(f"Mean accuracy on validation set: {cv_val_results.mean()}")
    return best_params, best_model

6. Modelowanie

Model z autoML (TPOT) był robiony i jest w prezentacji, ale usunął się z kodu; zasugerowanym modelem był MLPClassifier.

6.1 Regresja logistyczna

In [14]:
# Search C on a log grid with l1/l2 penalties; the saga solver supports both.
dist = dict(C=[10 ** x for x in range(-4, 3)], penalty=['l2', 'l1'])
lr = LogisticRegression(max_iter=1000,solver='saga', multi_class='multinomial')
lr_best_params, lr_best = train_evaluate_encoded2(lr, dist, X_train, y_encoded2, X_val, y_val_encoded2, cv=5)
Best parameters: {'penalty': 'l1', 'C': 100}
__________________________________________________________
Accuracy: 0.9291338582677166
              precision    recall  f1-score   support

    BARBUNYA       0.94      0.92      0.93       222
      BOMBAY       1.00      1.00      1.00        85
        CALI       0.96      0.92      0.94       276
    DERMASON       0.93      0.93      0.93       604
       HOROZ       0.93      0.95      0.94       319
       SEKER       0.96      0.96      0.96       339
        SIRA       0.87      0.88      0.88       441

    accuracy                           0.93      2286
   macro avg       0.94      0.94      0.94      2286
weighted avg       0.93      0.93      0.93      2286

__________________________________________________________
Cross-validation results: [0.92590164 0.93110236 0.93044619 0.92519685 0.92650919]
Mean accuracy: 0.927831246504023
Cross-validation results on validation set: [0.91484716 0.92778993 0.93435449 0.95185996 0.91466083]
Mean accuracy on validation set: 0.9287024738899028

Model działa dobrze; kroswalidacja nie wykrywa over- ani underfittingu.

6.2 Random Forest

In [60]:
# Reload the raw data and redo the identical stratified splits for the
# tree-based experiments.
data = pd.read_csv('../Data/our_data.csv')
X = data.drop('Class', axis=1)
y = data['Class']

X_train, X_val, y_train, y_val = train_test_split(
    X, y, stratify=y, test_size=0.3, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_val, y_val, stratify=y_val, test_size=0.3, random_state=42
)

# NOTE(review): the *_filtered frames below are never used afterwards, and
# `cols` is taken from the UNFILTERED X_train — so from here on the pipeline
# keeps all features (incl. Compactness/EquivDiameter/Area), unlike the
# first half of the notebook. Confirm this is intended.
X_train_filtered = X_train.drop(['Compactness','EquivDiameter', 'Area'], axis=1)
X_val_filtered = X_val.drop(['Compactness','EquivDiameter','Area'], axis=1)
X_test_filtered = X_test.drop(['Compactness','EquivDiameter','Area'], axis=1)
cols = X_train.columns
In [61]:
# Standardise to zero mean / unit variance. Note this rebinds the global
# `scaling`, replacing the Box-Cox transformer fitted earlier.
scaling = sklearn.preprocessing.StandardScaler()

X_train = scaling.fit_transform(X_train)
X_test = scaling.transform(X_test)
X_val = scaling.transform(X_val)

# Wrap the ndarrays back into DataFrames with the (unfiltered) column names.
X_train = pd.DataFrame(X_train, columns=cols)
X_test = pd.DataFrame(X_test, columns=cols)
X_val = pd.DataFrame(X_val, columns=cols)
In [62]:
# One-hot encoding of the target, fitted on the training labels only.
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(y_train.to_frame())
y_encoded = pd.DataFrame(enc.transform(y_train.to_frame()).toarray(),columns=enc.get_feature_names_out(['Class']))
y_val_encoded = pd.DataFrame(enc.transform(y_val.to_frame()).toarray(),columns=enc.get_feature_names_out(['Class']))
y_test_encoded = pd.DataFrame(enc.transform(y_test.to_frame()).toarray(),columns=enc.get_feature_names_out(['Class']))

# Ordinal encoding 0,1,2,... of the target.
# BUG FIX: fit once on the training labels and only transform val/test;
# re-fitting per split can assign different integer codes if a split is
# missing a class.
labelencoder = sklearn.preprocessing.LabelEncoder()
y_encoded2 = pd.DataFrame(labelencoder.fit_transform(y_train))
y_val_encoded2 = pd.DataFrame(labelencoder.transform(y_val))
y_test_encoded2 = pd.DataFrame(labelencoder.transform(y_test))
In [18]:
# Random-forest search space.
# BUG FIX: min_samples_split must be an int >= 2 in scikit-learn; the
# original list contained 1, which makes any candidate that samples it
# raise, so it was replaced with 2.
dist = dict(n_estimators=[5, 10, 25, 50, 100, 200, 250, 500, 1000],
            criterion=['gini', 'entropy', 'log_loss'],
            max_depth=[1, 5, 10, 25, 50, 100, 150],
            min_samples_split=[2, 5, 10, 25, 50, 100, 250, 500])
rf = RandomForestClassifier(random_state=42)
rf_best_params, rf_best = train_evaluate_encoded2(rf, dist, X_train, y_encoded2, X_val, y_val_encoded2, cv=5)
Best parameters: {'n_estimators': 200, 'min_samples_split': 5, 'max_depth': 25, 'criterion': 'log_loss'}
__________________________________________________________
Accuracy: 0.9243219597550306
              precision    recall  f1-score   support

    BARBUNYA       0.93      0.93      0.93       222
      BOMBAY       1.00      0.99      0.99        85
        CALI       0.95      0.92      0.94       276
    DERMASON       0.91      0.93      0.92       604
       HOROZ       0.94      0.95      0.94       319
       SEKER       0.96      0.96      0.96       339
        SIRA       0.87      0.86      0.87       441

    accuracy                           0.92      2286
   macro avg       0.94      0.93      0.94      2286
weighted avg       0.92      0.92      0.92      2286

__________________________________________________________
Cross-validation results: [0.9147541  0.92782152 0.92454068 0.92388451 0.92716535]
Mean accuracy: 0.9236332343702939
Cross-validation results on validation set: [0.91266376 0.9059081  0.92997812 0.92997812 0.89496718]
Mean accuracy on validation set: 0.9146990530610685

6.3 SVC

In [19]:
# Support-vector classifier: search C and gamma over three kernel families.
svc = SVC()
dist = {
    'C': [0.1, 1, 10, 100, 1000, 10000],
    'gamma': [1, 0.01, 0.0001],
    'kernel': ['rbf', 'poly', 'sigmoid'],
}
svc_best_params, svc_best = train_evaluate_encoded2(svc, dist, X_train, y_encoded2, X_val, y_val_encoded2, cv=5)
Best parameters: {'kernel': 'rbf', 'gamma': 0.01, 'C': 10000}
__________________________________________________________
Accuracy: 0.9278215223097113
              precision    recall  f1-score   support

    BARBUNYA       0.92      0.92      0.92       222
      BOMBAY       1.00      1.00      1.00        85
        CALI       0.95      0.91      0.93       276
    DERMASON       0.91      0.95      0.93       604
       HOROZ       0.94      0.95      0.94       319
       SEKER       0.97      0.95      0.96       339
        SIRA       0.89      0.86      0.88       441

    accuracy                           0.93      2286
   macro avg       0.94      0.94      0.94      2286
weighted avg       0.93      0.93      0.93      2286

__________________________________________________________
Cross-validation results: [0.92459016 0.93569554 0.92913386 0.93175853 0.92782152]
Mean accuracy: 0.9297999225506647
Cross-validation results on validation set: [0.89519651 0.90809628 0.93654267 0.92560175 0.9059081 ]
Mean accuracy on validation set: 0.9142690606098247

6.4 Naive Bayes

In [20]:
# Gaussian naive Bayes: only var_smoothing is tuned, over a log-spaced grid
# from 1 down to 1e-9.
nb = GaussianNB()
dist = {'var_smoothing': np.logspace(0, -9, num=100)}
nb_best_params, nb_best = train_evaluate_encoded2(nb, dist, X_train, y_encoded2, X_val, y_val_encoded2, cv=5)
Best parameters: {'var_smoothing': 2.848035868435799e-08}
__________________________________________________________
Accuracy: 0.9002624671916011
              precision    recall  f1-score   support

    BARBUNYA       0.89      0.85      0.87       222
      BOMBAY       1.00      1.00      1.00        85
        CALI       0.91      0.89      0.90       276
    DERMASON       0.93      0.88      0.90       604
       HOROZ       0.91      0.96      0.93       319
       SEKER       0.93      0.96      0.94       339
        SIRA       0.82      0.85      0.84       441

    accuracy                           0.90      2286
   macro avg       0.91      0.91      0.91      2286
weighted avg       0.90      0.90      0.90      2286

__________________________________________________________
Cross-validation results: [0.89180328 0.89895013 0.9015748  0.89238845 0.89501312]
Mean accuracy: 0.8959459575749753
Cross-validation results on validation set: [0.8930131  0.9059081  0.90809628 0.9059081  0.88402626]
Mean accuracy on validation set: 0.8993903662580145

Słabo, ale co zrobić — nie będziemy dalej używać tego modelu.

6.5 DecisionTree

In [21]:
dt = DecisionTreeClassifier()
# Decision-tree search space.
# BUG FIX: min_samples_split=1 is invalid in scikit-learn (must be >= 2);
# replaced with 2 so sampled candidates cannot error out.
dist  = dict(criterion=['gini', 'entropy', 'log_loss'],
            max_depth=[1, 5, 7, 9, 10, 11, 15, 20, 50, 75, 100],
            min_samples_split=[2, 3, 4, 5, 7, 10, 100, 250, 500])
dt_best_params, dt_best = train_evaluate_encoded2(dt, dist, X_train, y_encoded2, X_val, y_val_encoded2, cv=5)
Best parameters: {'min_samples_split': 10, 'max_depth': 50, 'criterion': 'gini'}
__________________________________________________________
Accuracy: 0.8985126859142607
              precision    recall  f1-score   support

    BARBUNYA       0.91      0.88      0.89       222
      BOMBAY       1.00      1.00      1.00        85
        CALI       0.91      0.90      0.91       276
    DERMASON       0.89      0.91      0.90       604
       HOROZ       0.91      0.94      0.92       319
       SEKER       0.93      0.92      0.93       339
        SIRA       0.85      0.83      0.84       441

    accuracy                           0.90      2286
   macro avg       0.91      0.91      0.91      2286
weighted avg       0.90      0.90      0.90      2286

__________________________________________________________
Cross-validation results: [0.87868852 0.9160105  0.89501312 0.89895013 0.89698163]
Mean accuracy: 0.8971287810335185
Cross-validation results on validation set: [0.88209607 0.87089716 0.91247265 0.87527352 0.84682713]
Mean accuracy on validation set: 0.877513305877519

6.6 KNeighbors

In [22]:
from sklearn.neighbors import KNeighborsClassifier
In [23]:
# K-nearest neighbours: only the neighbour count is tuned.
dist = {'n_neighbors': [3, 5, 7, 8, 9, 10, 11, 12, 15, 21]}
kn = KNeighborsClassifier()

# (original author's note) this cell misbehaved for the author, possibly due
# to scikit-learn 1.4.2; the version was kept mid-project since everything
# except stacking still runs
kn_best_params, kn_best = train_evaluate_encoded2(kn, dist, X_train, y_encoded2, X_val, y_val_encoded2, cv=5)
Best parameters: {'n_neighbors': 8}
__________________________________________________________
Accuracy: 0.9221347331583553
              precision    recall  f1-score   support

    BARBUNYA       0.96      0.91      0.94       222
      BOMBAY       1.00      1.00      1.00        85
        CALI       0.94      0.93      0.94       276
    DERMASON       0.90      0.94      0.92       604
       HOROZ       0.92      0.96      0.94       319
       SEKER       0.96      0.95      0.96       339
        SIRA       0.87      0.84      0.85       441

    accuracy                           0.92      2286
   macro avg       0.94      0.93      0.93      2286
weighted avg       0.92      0.92      0.92      2286

__________________________________________________________
Cross-validation results: [0.91803279 0.93372703 0.92913386 0.92257218 0.92650919]
Mean accuracy: 0.9259950088206187
Cross-validation results on validation set: [0.91266376 0.9059081  0.91684902 0.93654267 0.90153173]
Mean accuracy on validation set: 0.9146990530610685

6.7 XGBOOST

In [24]:
from xgboost import XGBClassifier

# BUG FIX: the original constructor used misspelled keyword names
# ('nestimators', 'ma_depth'), which XGBoost silently accepts as unknown
# kwargs and ignores; corrected to n_estimators / max_depth. (The search
# below overrides them anyway, but the typos would survive a set_params
# round-trip and mislead readers.)
xgb = XGBClassifier(subsample=0.8, n_estimators=300, max_depth=7,
                    learning_rate=0.1, gamma=0.1, colsample_bytree=0.5)
param_grid = {
    'n_estimators': [100, 200, 300],      # number of boosting rounds
    'max_depth': [3, 5, 7],               # maximum depth of each tree
    'learning_rate': [0.01, 0.1, 0.3],    # shrinkage per boosting step
    'subsample': [0.5, 0.8, 1.0],         # row subsampling ratio
    'colsample_bytree': [0.5, 0.8, 1.0],  # column subsampling ratio per tree
    'gamma': [0, 0.1, 0.2]                # min loss reduction to split a leaf
}
xgb_best_params, xgb_best = train_evaluate_encoded2(xgb, param_grid, X_train, y_encoded2, X_val, y_val_encoded2, cv=5)
Best parameters: {'subsample': 1.0, 'n_estimators': 100, 'max_depth': 3, 'learning_rate': 0.1, 'gamma': 0, 'colsample_bytree': 1.0}
__________________________________________________________
Accuracy: 0.9282589676290464
              precision    recall  f1-score   support

    BARBUNYA       0.94      0.92      0.93       222
      BOMBAY       1.00      1.00      1.00        85
        CALI       0.97      0.92      0.94       276
    DERMASON       0.91      0.94      0.93       604
       HOROZ       0.93      0.96      0.95       319
       SEKER       0.96      0.96      0.96       339
        SIRA       0.88      0.86      0.87       441

    accuracy                           0.93      2286
   macro avg       0.94      0.94      0.94      2286
weighted avg       0.93      0.93      0.93      2286

__________________________________________________________
Cross-validation results: [0.92721311 0.93897638 0.92847769 0.92716535 0.92388451]
Mean accuracy: 0.9291434103523946
Cross-validation results on validation set: [0.90829694 0.89715536 0.93435449 0.93654267 0.89277899]
Mean accuracy on validation set: 0.9138256906156537

6.8 Stacking Classifier

In [66]:
from sklearn.ensemble import StackingClassifier


# Base learners configured with (roughly) the hyper-parameters found above.
# NOTE(review): kn uses n_neighbors=9 and dt max_depth=15/criterion='entropy',
# which differ from the tuned values (8; 50/'gini') — confirm this is deliberate.
lr = LogisticRegression(C=100, penalty='l1', max_iter=1000,solver='saga', multi_class='multinomial')
svc = SVC(kernel= 'rbf', gamma= 0.01, C= 10000)
dt = DecisionTreeClassifier(min_samples_split= 10, max_depth= 15, criterion ='entropy')
nb = GaussianNB(var_smoothing= 2.848035868435799e-08)
kn = KNeighborsClassifier(n_neighbors= 9)
rf = RandomForestClassifier(n_estimators= 200, min_samples_split= 5, max_depth = 25, criterion= "log_loss")

models = [
    ('lr', lr),
     ('svc', svc), ('nb', nb), ('rf', rf), ('dt', dt), ('kn', kn)]
# Stack the six base learners; a multinomial logistic regression combines
# their predictions.
stack = StackingClassifier(estimators=models, final_estimator=LogisticRegression( max_iter=1000,solver='saga', multi_class='multinomial'))
stack.fit(X_train, y_encoded2)
y_pred = stack.predict(X_val)
# NOTE(review): this overwrites the global y_val with decoded class names —
# later cells (the ROC curves) rely on this side effect; re-running cells
# out of order will break it.
y_val = labelencoder.inverse_transform(y_val_encoded2)
y_pred = labelencoder.inverse_transform(y_pred)


print(classification_report(y_val, y_pred, target_names=['BARBUNYA', 'BOMBAY', 'CALI', 'DERMASON', 'HOROZ', 'SEKER', 'SIRA']))
plot_confusion_matrix(y_val, y_pred, class_names)
train_evaluate_encoded2(stack, {}, X_train, y_encoded2, X_val, y_val_encoded2, cv=5)
              precision    recall  f1-score   support

    BARBUNYA       0.95      0.91      0.93       222
      BOMBAY       1.00      1.00      1.00        85
        CALI       0.95      0.92      0.94       276
    DERMASON       0.92      0.94      0.93       604
       HOROZ       0.94      0.96      0.95       319
       SEKER       0.96      0.96      0.96       339
        SIRA       0.88      0.87      0.87       441

    accuracy                           0.93      2286
   macro avg       0.94      0.94      0.94      2286
weighted avg       0.93      0.93      0.93      2286

Best parameters: {}
__________________________________________________________
Accuracy: 0.9282589676290464
              precision    recall  f1-score   support

    BARBUNYA       0.94      0.91      0.93       222
      BOMBAY       1.00      1.00      1.00        85
        CALI       0.94      0.92      0.93       276
    DERMASON       0.91      0.94      0.93       604
       HOROZ       0.93      0.96      0.94       319
       SEKER       0.97      0.96      0.96       339
        SIRA       0.88      0.86      0.87       441

    accuracy                           0.93      2286
   macro avg       0.94      0.94      0.94      2286
weighted avg       0.93      0.93      0.93      2286

__________________________________________________________
Cross-validation results: [0.92655738 0.93635171 0.93175853 0.92979003 0.92913386]
Mean accuracy: 0.9307182995568176
Cross-validation results on validation set: [0.90829694 0.91247265 0.94310722 0.9452954  0.90809628]
Mean accuracy on validation set: 0.923453699368389
Out[66]:
({},
 StackingClassifier(estimators=[('lr',
                                 LogisticRegression(C=100, max_iter=1000,
                                                    multi_class='multinomial',
                                                    penalty='l1',
                                                    solver='saga')),
                                ('svc', SVC(C=10000, gamma=0.01)),
                                ('nb',
                                 GaussianNB(var_smoothing=2.848035868435799e-08)),
                                ('rf',
                                 RandomForestClassifier(criterion='log_loss',
                                                        max_depth=25,
                                                        min_samples_split=5,
                                                        n_estimators=200)),
                                ('dt',
                                 DecisionTreeClassifier(criterion='entropy',
                                                        max_depth=15,
                                                        min_samples_split=10)),
                                ('kn', KNeighborsClassifier(n_neighbors=9))],
                    final_estimator=LogisticRegression(max_iter=1000,
                                                       multi_class='multinomial',
                                                       solver='saga')))

7. Wizualizacje dla wybranych modeli

In [26]:
#pip install pdpbox
#pip install lime
import lime
from lime import lime_tabular
import shap
from sklearn.tree import export_graphviz
import graphviz
from pdpbox import pdp, info_plots
import graphviz
from sklearn.inspection import PartialDependenceDisplay
from sklearn import tree 


def visualize_model_lime(best_model, X_train, X_val, class_names, sample_idx):
    """Explain one validation-set prediction with LIME and render it inline.

    A tabular explainer is built on the training data; the prediction for
    row `sample_idx` of X_val is then explained across all features and all
    class labels, and shown in the notebook.
    """
    explainer = lime.lime_tabular.LimeTabularExplainer(
        X_train.values,
        feature_names=X_train.columns.tolist(),
        class_names=class_names,
        discretize_continuous=True,
    )

    explanation = explainer.explain_instance(
        X_val.values[sample_idx],
        best_model.predict_proba,
        num_features=len(X_train.columns),
        top_labels=len(class_names),
    )

    explanation.show_in_notebook()

def visualize_model_shap(best_model, X_train, X_val, class_names):
    """Compute SHAP values for a fitted tree-based model and show the summary plot."""
    shap.initjs()
    tree_explainer = shap.TreeExplainer(best_model)
    values = tree_explainer.shap_values(X_val)
    shap.summary_plot(values, X_val, feature_names=X_train.columns, class_names=class_names)

def visualize_model_tree(best_model, X_train, X_val, class_names):
    """Export a fitted decision tree to Graphviz, render it to tree.png and open it.

    NOTE(review): the original author reported this may not work everywhere
    (the rendered "tree" PNG file can be inspected directly instead).
    """
    dot_data = tree.export_graphviz(
        best_model,
        feature_names=X_train.columns,
        class_names=class_names,
        filled=True,
        rounded=True,
        special_characters=True,
        out_file=None,
    )
    graph = graphviz.Source(dot_data)
    graph.format = "png"
    graph.render("tree")
    graph.view()

def visualize_model_partial_dependence(best_model, X_train, features, target_class, class_names):
    """Draw partial-dependence plots of `features` for one target class.

    BUG FIX: `PartialDependenceDisplay.from_estimator` already computes and
    draws the figure; the original additionally called `display.plot()`,
    which re-rendered the same display a second time (the likely cause of
    the duplicated figures noted in the notebook prose).
    """
    PartialDependenceDisplay.from_estimator(best_model, X_train, features, target=target_class)

7.1 Stacking

In [29]:
# LIME explanations for the stacking model on the first 7 validation rows.
for index in range(7):
    visualize_model_lime(stack, X_train, X_val, class_names, index)
In [31]:
# LIME explanations for the tuned KNN model on the first 7 validation rows.
for index in range(7):
    visualize_model_lime(kn_best, X_train, X_val, class_names, index)

7.2 XGBOOST

In [27]:
# SHAP summary plot for the tuned XGBoost model.
visualize_model_shap(xgb_best, X_train, X_val, class_names)
In [35]:
class_names = ['BARBUNYA', 'BOMBAY', 'CALI', 'DERMASON', 'HOROZ', 'SEKER', 'SIRA']
In [ ]:
# Partial dependence of Compactness and ShapeFactor1, plus their interaction.
# NOTE(review): only the first 2 of the 7 classes are plotted here — confirm
# whether that was intentional.
features = ['Compactness', 'ShapeFactor1', ('Compactness', 'ShapeFactor1')] 

for target_class in range(2):
    visualize_model_partial_dependence(xgb_best, X_train, features, target_class, class_names)
In [ ]:
# Partial dependence of Area and Perimeter (plus interaction) for all 7 classes.
features = ['Area', 'Perimeter', ('Area','Perimeter')] 

for target_class in range(7):
    visualize_model_partial_dependence(xgb_best, X_train, features, target_class, class_names)

Nie wiem, dlaczego powyżej wszystkie wykresy wyszły takie same — prawdopodobnie rysuje się ta sama figura dwukrotnie.

In [ ]:
# LIME explanations for the tuned XGBoost model on the first 7 validation rows.
for index in range(7):
    visualize_model_lime(xgb_best, X_train, X_val, class_names, index)
In [37]:
from sklearn.metrics import roc_curve, auc
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import label_binarize

7.3 ROC curves

In [76]:
# Convert labels to binary format
y_val_binarized = label_binarize(y_val, classes=np.unique(y_val))
n_classes = y_val_binarized.shape[1]

# Initialize subplots
fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(15, 10))

models = [xgb_best.predict_proba(X_val), #lr_best.predict_proba(X_val),
           rf_best.predict_proba(X_val), nb_best.predict_proba(X_val), dt_best.predict_proba(X_val), stack.predict_proba(X_val), kn_best.predict_proba(X_val)]
model_names = ['XGBoost Classifier', #'Logistic Regression',
                'Random Forest', 'Naive Bayes',  'Decision Tree', 'Stacking Classifier', 'KNeighbors Classifier']

for idx, (model, model_name) in enumerate(zip(models, model_names)):
    row = idx // 2
    col = idx % 2
    ax = axes[row, col]

    for i in range(n_classes):
        fpr, tpr, _ = roc_curve(y_val_binarized[:, i], model[:, i])
        roc_auc = auc(fpr, tpr)

        ax.plot(fpr, tpr, lw=2, label=f'{class_names[i]} (AUC = {roc_auc:.2f})')

    ax.plot([0, 1], [0, 1], color='gray', lw=1, linestyle='--')
    ax.set_xlim([0.0, 1])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title(f'ROC Curve for {model_name}')
    ax.legend(loc='lower right')
    ax.grid(True)

plt.tight_layout()
plt.show()
In [77]:
# Same ROC curves as the previous cell, zoomed to FPR in [0, 0.14] to make
# the near-perfect classifiers distinguishable.
# NOTE(review): this is a copy-paste of the previous cell with only the
# x-limit changed — a shared plotting function taking an xlim parameter
# would remove the duplication.
# Convert labels to binary format
y_val_binarized = label_binarize(y_val, classes=np.unique(y_val))
n_classes = y_val_binarized.shape[1]

# Initialize subplots
fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(15, 10))

models = [xgb_best.predict_proba(X_val),#lr_best.predict_proba(X_val),
           rf_best.predict_proba(X_val), nb_best.predict_proba(X_val), dt_best.predict_proba(X_val), stack.predict_proba(X_val), kn_best.predict_proba(X_val)]
model_names = ['XGBoost Classifier', #'Logistic Regression',
                'Random Forest', 'Naive Bayes',  'Decision Tree', 'Stacking Classifier', 'KNeighbors Classifier']

for idx, (model, model_name) in enumerate(zip(models, model_names)):
    row = idx // 2
    col = idx % 2
    ax = axes[row, col]

    for i in range(n_classes):
        fpr, tpr, _ = roc_curve(y_val_binarized[:, i], model[:, i])
        roc_auc = auc(fpr, tpr)

        ax.plot(fpr, tpr, lw=2, label=f'{class_names[i]} (AUC = {roc_auc:.2f})')

    ax.plot([0, 1], [0, 1], color='gray', lw=1, linestyle='--')
    ax.set_xlim([0.0, 0.14])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title(f'ROC Curve for {model_name}')
    ax.legend(loc='lower right')
    ax.grid(True)

plt.tight_layout()
plt.show()

7.4 DECISION TREE

Może nie działać na Windows; jest plik tree.png — można go sobie obejrzeć.

In [44]:
#visualize_model_tree(dt_best, X_train, X_val, class_names)
dot: graph is too large for cairo-renderer bitmaps. Scaling by 0.994718 to fit
dot: graph is too large for cairo-renderer bitmaps. Scaling by 0.994718 to fit

8. Ostateczne testy na wybranych modelach

8.1 Stacking

In [43]:
# Accuracy of the stacking ensemble on the held-out test set.
y_pred = stack.predict(X_test)
# NOTE(review): this overwrites the global y_test with decoded class names —
# re-running this cell out of order will break later uses of y_test.
y_test = labelencoder.inverse_transform(y_test_encoded2)
y_pred = labelencoder.inverse_transform(y_pred)
print(accuracy_score(y_test, y_pred)   )
print(classification_report(y_test, y_pred, target_names=['BARBUNYA', 'BOMBAY', 'CALI', 'DERMASON', 'HOROZ', 'SEKER', 'SIRA']))
cm = confusion_matrix(y_test, y_pred, labels=class_names)
plot_confusion_matrix(y_test, y_pred, class_names)
0.9194699286442406
              precision    recall  f1-score   support

    BARBUNYA       0.95      0.91      0.93        96
      BOMBAY       0.97      1.00      0.99        37
        CALI       0.93      0.97      0.95       118
    DERMASON       0.91      0.92      0.91       259
       HOROZ       0.97      0.95      0.96       137
       SEKER       0.94      0.92      0.93       145
        SIRA       0.85      0.87      0.86       189

    accuracy                           0.92       981
   macro avg       0.93      0.93      0.93       981
weighted avg       0.92      0.92      0.92       981

9. sprawdzenie skuteczności modelu na zbiorze pomniejszonym o fasolki SIRA

In [59]:
# Without the SIRA class: check how separable the remaining 6 classes are.
class_names = ['BARBUNYA', 'BOMBAY', 'CALI', 'DERMASON', 'HOROZ', 'SEKER']
data = data[data['Class'] != 'SIRA']
X_s = data.drop('Class', axis=1)
y_s = data['Class']
X_s_train, X_s_val, y_s_train, y_s_val = train_test_split(X_s, y_s, test_size=0.3, random_state=42)
# BUG FIX: fit the label encoder on the TRAINING labels and only transform
# the validation labels. The original fitted separately on each split
# (validation first!), which assigns inconsistent integer codes whenever a
# split is missing a class — the report below in fact shows only 5 of the 6
# classes present in the validation split.
y_s_train = labelencoder.fit_transform(y_s_train)
y_s_val = labelencoder.transform(y_s_val)
X_s_train = X_s_train.drop(['Compactness','EquivDiameter','Area'], axis=1)
X_s_val = X_s_val.drop(['Compactness','EquivDiameter','Area'], axis=1)
# Re-fit the (global) scaler on this reduced training set.
X_s_train = scaling.fit_transform(X_s_train)
X_s_val = scaling.transform(X_s_val)
X_s_train = pd.DataFrame(X_s_train)
X_s_val = pd.DataFrame(X_s_val)

# Train a KNN with the neighbour count tuned earlier and evaluate.
kn_s = KNeighborsClassifier(n_neighbors=8)
kn_s.fit(X_s_train, y_s_train)
y_s_pred = kn_s.predict(X_s_val)
print(accuracy_score(y_s_val, y_s_pred))
print(classification_report(y_s_val, y_s_pred))
0.9735253135160241
              precision    recall  f1-score   support

           0       0.97      0.93      0.95       325
           1       1.00      1.00      1.00       117
           2       0.92      0.97      0.94       394
           3       0.99      1.00      1.00       861
           4       0.98      0.96      0.97       456

    accuracy                           0.97      2153
   macro avg       0.97      0.97      0.97      2153
weighted avg       0.97      0.97      0.97      2153